library(ggplot2)
library(readxl)
library(readr)
library(tidyverse)
library(dplyr)
library(sf)
library(plotly)
library(geojsonio)

load dataset

df_descriptive=read_csv("filtered_merged_dataset_sample.csv")
data_final <- read_csv("data_final.csv")

Total incidents per NTA

Load spatial data (replace with actual shapefile path)

cdta_shape = st_read("nycdta2020_24d/nycdta2020.shp")
## Reading layer `nycdta2020' from data source 
##   `/Users/wangmingyin/Desktop/data science 1/Final_website/nycdta2020_24d/nycdta2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 71 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
boro_shape = st_read("Borough Boundaries/geo_export_391a75ed-0ae4-4c88-8c30-3588c75bd01e.shp")
## Reading layer `geo_export_391a75ed-0ae4-4c88-8c30-3588c75bd01e' from data source `/Users/wangmingyin/Desktop/data science 1/Final_website/Borough Boundaries/geo_export_391a75ed-0ae4-4c88-8c30-3588c75bd01e.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 5 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25559 ymin: 40.49613 xmax: -73.70001 ymax: 40.91553
## Geodetic CRS:  WGS84(DD)

CDTA cleaning

# There are 3 NA in the dataset, since there are no incident in these CDTAs
# Identify Missing Matches
unmatched_cdta <- setdiff(cdta_shape$CDTA2020, data_final$CDTA)
 

# Assign 0 to Missing Areas
cdta_incident_counts <- data_final %>%
  group_by(CDTA) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop") %>%
  complete(CDTA = unique(cdta_shape$CDTA2020), fill = list(Number_of_Incidents = 0))





#Re-Merge the Data
cdta_map_data <- cdta_shape %>%
  left_join(cdta_incident_counts, by = c("CDTA2020" = "CDTA"))

# Update NA Handling
cdta_map_data <- cdta_map_data %>%
  mutate(
    Number_of_Incidents = ifelse(is.na(Number_of_Incidents), 0, Number_of_Incidents),
    Incident_Range = cut(
      Number_of_Incidents,
      breaks = seq(0, 600, by = 120), 
      labels = c("0-120", "121-240", "241-360", "361-480", "481-600"),
      include.lowest = TRUE
    )
  )
# Count the boro incident
boro_incident_counts <- data_final %>%
  group_by(BORO) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")  %>%
  mutate(BORO = tolower(BORO) )

# Lowercase the boro in boro_shape
boro_shape = boro_shape %>%
  mutate(boro_name = tolower(boro_name))
 

# Merge spatial data with incident counts
boro_map_data <- boro_shape %>%
  left_join(boro_incident_counts, by = c("boro_name" = "BORO"))

# Create custom breaks for Number_of_Incidents
boro_map_data <- boro_map_data %>%
  mutate(
    Incident_Range = cut(
      Number_of_Incidents,
      breaks = seq(0, 4000, by = 800),  # Breaks from 0 to 4000, every 800 cases
      labels = c("0-800", "801-1600", "1601-2400", "2401-3200", "3201-4000"),
      include.lowest = TRUE
    )
  )
boro_map_data <- boro_map_data %>%
  mutate(hover_text = paste("Borough:", boro_name,
                            "<br>Number of Incidents:", Number_of_Incidents))
# Interactive plot using plotly
plot <- plot_ly() %>%
  add_sf(
    data = boro_map_data,
    split = ~boro_name,  # Separate polygons by boroughs
    color = ~Number_of_Incidents,  # Color based on the number of incidents
    colors = "viridis",  # Color palette
    text = ~hover_text,  # Hover text
    hoverinfo = "text"
  ) %>%
  layout(
    title = "Total Number of Incidents Across NYC BOROs (2017-2023)",
    geo = list(
      resolution = 50,
      showland = TRUE,
      landcolor = "rgb(217, 217, 217)",
      showlakes = TRUE,
      lakecolor = "rgb(173, 216, 230)",
      projection = list(type = "mercator")
    )
  )

# Display the plot
plot
cdta_shape = st_read("nycdta2020_24d/nycdta2020.shp")
## Reading layer `nycdta2020' from data source 
##   `/Users/wangmingyin/Desktop/data science 1/Final_website/nycdta2020_24d/nycdta2020.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 71 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
## There is space between letter and number in CDTA, I deleted the space below
data_final$CDTA <- gsub(" ", "", data_final$CDTA)

cdta_incident_counts <- data_final %>%
  group_by(CDTA) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")

# Prepare incident data: count incidents per NTA_clean
cdta_incident_counts <- data_final %>%
  group_by(CDTA) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")



# Filter out borough
boroughs <- unique(cdta_map_data$BoroName)

CDTA map for NYC

# Plot the map with custom ranges
ggplot(data = boro_map_data) +
  geom_sf(aes(fill = Incident_Range), color = "white", size = 0.2) +
  geom_sf_text(aes(label = Number_of_Incidents), size = 3, color = "black") +  # Add labels
  scale_fill_manual(
    values = c(
      "0-800" = "#b2e2e2",
      "801-1600" = "skyblue",
      "1601-2400" = "#66c2a4",
      "2401-3200" = "#2ca25f",
      "3201-4000" = "#006d2c"
    ),
    name = "Number of Incidents"
  ) +
  labs(
    title = "Total Number of Incidents Across NYC BOROs from 2017 to 2023",
    subtitle = "Incidents grouped by range (0-4000, 800 breaks)",
    caption = "Data Source: Your dataset"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    axis.title.x = element_blank(),  # Remove x-axis label
    axis.title.y = element_blank()
  )

CDTA map for each boro

# Plot
for (b in boroughs) {
    borough_data <- cdta_map_data %>%
        filter(BoroName == b)
    plot <- ggplot(data = borough_data) +
        geom_sf(aes(fill = Number_of_Incidents), color = "black") +
      geom_sf_text(aes(label = Number_of_Incidents), size = 3, color = "black") +  # Add labels+
        scale_fill_gradientn(
      colors = c("blue", "green", "yellow", "red"), # Custom color scale
      name = "Number of Incidents"
    ) +
        labs(
            title = paste("CDTA Incidents in", b),
            subtitle = "2017 to 2023",
            x = "Longitude",
            y = "Latitude"
        ) +
        theme_minimal()
    print(plot)  # Move inside the loop
}

summarize CDTA and BORO

## There is space between letter and number in CDTA, I deleted the space below
data_final$CDTA <- gsub(" ", "", data_final$CDTA)

cdta_incident_counts <- data_final %>%
  group_by(CDTA) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")

merge datasets

# Prepare incident data: count incidents per NTA_clean
cdta_incident_counts <- data_final %>%
  group_by(CDTA) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")

# Merge spatial data with incident counts
cdta_map_data <- cdta_shape %>%
  left_join(cdta_incident_counts, by = c("CDTA2020" = "CDTA"))

# Create custom breaks for Number_of_Incidents
cdta_map_data <- cdta_map_data %>%
  mutate(
    Incident_Range = cut(
      Number_of_Incidents,
      breaks = seq(0, 400, by = 80),  # Breaks from 0 to 1000, every 200 cases
      labels = c("0-80", "81-160", "161-240", "241-320", "321-400"),
      include.lowest = TRUE
    )
  )
# Plot the map with custom ranges
ggplot(data = cdta_map_data) +
  geom_sf(aes(fill = Incident_Range), color = "white", size = 0.2) +
  geom_sf_text(aes(label = Number_of_Incidents), size = 3, color = "black") +  # Add labels
  scale_fill_manual(
    values = c(
      "0-80" = "#b2e2e2",
      "81-160" = "skyblue",
      "161-240" = "#66c2a4",
      "241-320" = "#2ca25f",
      "321-400" = "#006d2c"
    ),
    name = "Number of Incidents"
  ) +
  labs(
    title = "Total Number of Incidents Across NYC CDTAs from 2017 to 2023",
    subtitle = "Incidents grouped by range (0-400, 80 breaks)",
    caption = "Data Source: Your dataset"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(), 
    axis.title.x = element_blank(),  # Remove x-axis label
    axis.title.y = element_blank()
  )

# Count the boro incident
boro_incident_counts <- data_final %>%
  group_by(BORO) %>%
  summarise(Number_of_Incidents = n(), .groups = "drop")  %>%
  mutate(BORO = tolower(BORO) )

# Lowercase the boro in boro_shape
boro_shape = boro_shape %>%
  mutate(boro_name = tolower(boro_name))
 

# Merge spatial data with incident counts
boro_map_data <- boro_shape %>%
  left_join(boro_incident_counts, by = c("boro_name" = "BORO"))

# Create custom breaks for Number_of_Incidents
boro_map_data <- boro_map_data %>%
  mutate(
    Incident_Range = cut(
      Number_of_Incidents,
      breaks = seq(0, 4000, by = 800),  # Breaks from 0 to 4000, every 800 cases
      labels = c("0-800", "801-1600", "1601-2400", "2401-3200", "3201-4000"),
      include.lowest = TRUE
    )
  )
# Plot the map with custom ranges
ggplot(data = boro_map_data) +
  geom_sf(aes(fill = Incident_Range), color = "white", size = 0.2) +
  geom_sf_text(aes(label = Number_of_Incidents), size = 3, color = "black") +  # Add labels
  scale_fill_manual(
    values = c(
      "0-800" = "#b2e2e2",
      "801-1600" = "skyblue",
      "1601-2400" = "#66c2a4",
      "2401-3200" = "#2ca25f",
      "3201-4000" = "#006d2c"
    ),
    name = "Number of Incidents"
  ) +
  labs(
    title = "Total Number of Incidents Across NYC BOROs from 2017 to 2023",
    subtitle = "Incidents grouped by range (0-4000, 800 breaks)",
    caption = "Data Source: Your dataset"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    axis.title.x = element_blank(),  # Remove x-axis label
    axis.title.y = element_blank()
  )